In [1]:
import pandas as pd
In [3]:
df = pd.read_csv('apps.csv')
In [5]:
print(df.head())
   Unnamed: 0                                                App  \
0           0     Photo Editor & Candy Camera & Grid & ScrapBook   
1           1                                Coloring book moana   
2           2  U Launcher Lite – FREE Live Cool Themes, Hide ...   
3           3                              Sketch - Draw & Paint   
4           4              Pixel Draw - Number Art Coloring Book   

         Category  Rating  Reviews  Size     Installs  Type Price  \
0  ART_AND_DESIGN     4.1      159  19.0      10,000+  Free     0   
1  ART_AND_DESIGN     3.9      967  14.0     500,000+  Free     0   
2  ART_AND_DESIGN     4.7    87510   8.7   5,000,000+  Free     0   
3  ART_AND_DESIGN     4.5   215644  25.0  50,000,000+  Free     0   
4  ART_AND_DESIGN     4.3      967   2.8     100,000+  Free     0   

  Content Rating                     Genres      Last Updated  \
0       Everyone               Art & Design   January 7, 2018   
1       Everyone  Art & Design;Pretend Play  January 15, 2018   
2       Everyone               Art & Design    August 1, 2018   
3           Teen               Art & Design      June 8, 2018   
4       Everyone    Art & Design;Creativity     June 20, 2018   

          Current Ver   Android Ver  
0               1.0.0  4.0.3 and up  
1               2.0.0  4.0.3 and up  
2               1.2.4  4.0.3 and up  
3  Varies with device    4.2 and up  
4                 1.1    4.4 and up  
In [17]:
# DATA PREPARATION
print( df.isnull().sum())
Unnamed: 0        0
App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64
In [13]:
df = df.dropna()
In [15]:
print( df.isnull().sum())
Unnamed: 0        0
App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64
In [19]:
basic_stats = df.describe()
print(basic_stats)
         Unnamed: 0       Rating       Reviews         Size
count   7021.000000  7021.000000  7.021000e+03  7021.000000
mean    5638.433984     4.160704  1.448960e+05    21.767597
std     3079.108366     0.559241  1.024428e+06    22.731237
min        0.000000     1.000000  1.000000e+00     0.000000
25%     3087.000000     4.000000  8.400000e+01     4.900000
50%     5716.000000     4.300000  1.546000e+03    13.000000
75%     8292.000000     4.500000  2.658700e+04    31.000000
max    10840.000000     5.000000  4.489172e+07   100.000000
In [25]:
print(df.dtypes)
Unnamed: 0          int64
App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object
In [27]:
# Category Exploration
In [31]:
unique_categories = df['Category'].unique()
print(unique_categories)
['ART_AND_DESIGN' 'AUTO_AND_VEHICLES' 'BEAUTY' 'BOOKS_AND_REFERENCE'
 'BUSINESS' 'COMICS' 'COMMUNICATION' 'DATING' 'EDUCATION' 'ENTERTAINMENT'
 'EVENTS' 'FINANCE' 'FOOD_AND_DRINK' 'HEALTH_AND_FITNESS' 'HOUSE_AND_HOME'
 'LIBRARIES_AND_DEMO' 'LIFESTYLE' 'GAME' 'FAMILY' 'MEDICAL' 'SOCIAL'
 'SHOPPING' 'PHOTOGRAPHY' 'SPORTS' 'TRAVEL_AND_LOCAL' 'TOOLS'
 'PERSONALIZATION' 'PRODUCTIVITY' 'PARENTING' 'WEATHER' 'VIDEO_PLAYERS'
 'NEWS_AND_MAGAZINES' 'MAPS_AND_NAVIGATION']
In [33]:
category_counts = df['Category'].value_counts()
print(category_counts)
Category
FAMILY                 1511
GAME                    832
TOOLS                   625
PERSONALIZATION         274
LIFESTYLE               269
MEDICAL                 266
FINANCE                 258
PRODUCTIVITY            223
BUSINESS                222
SPORTS                  221
PHOTOGRAPHY             204
HEALTH_AND_FITNESS      191
COMMUNICATION           188
SOCIAL                  156
NEWS_AND_MAGAZINES      154
SHOPPING                146
TRAVEL_AND_LOCAL        141
BOOKS_AND_REFERENCE     141
DATING                  122
VIDEO_PLAYERS           112
MAPS_AND_NAVIGATION      94
EDUCATION                88
FOOD_AND_DRINK           72
ENTERTAINMENT            64
AUTO_AND_VEHICLES        63
LIBRARIES_AND_DEMO       60
ART_AND_DESIGN           58
HOUSE_AND_HOME           50
WEATHER                  50
COMICS                   47
PARENTING                44
EVENTS                   38
BEAUTY                   37
Name: count, dtype: int64
In [35]:
import matplotlib.pyplot as plt

category_counts.plot(kind='bar', figsize=(10, 6))
plt.title('App Distribution Across Categories')
plt.xlabel('Category')
plt.ylabel('Number of Apps')
plt.show()
No description has been provided for this image
In [37]:
# Metric Analysis
In [39]:
ratings_summary = df['Rating'].describe()
print(ratings_summary)
count    7021.000000
mean        4.160704
std         0.559241
min         1.000000
25%         4.000000
50%         4.300000
75%         4.500000
max         5.000000
Name: Rating, dtype: float64
In [41]:
size_summary = df['Size'].describe()
print(size_summary)
count    7021.000000
mean       21.767597
std        22.731237
min         0.000000
25%         4.900000
50%        13.000000
75%        31.000000
max       100.000000
Name: Size, dtype: float64
In [43]:
popularity_summary = df['Installs'].describe()
print(popularity_summary)
count           7021
unique            19
top       1,000,000+
freq            1174
Name: Installs, dtype: object
In [45]:
pricing_summary = df['Price'].describe()
print(pricing_summary)
count     7021
unique      68
top          0
freq      6482
Name: Price, dtype: object
In [53]:
# Histogram for app ratings
df['Rating'].hist(bins=20, edgecolor='black')
plt.title('Distribution of App Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

# Box plot for app sizes
df['Size'].plot(kind='box')
plt.title('Distribution of App Sizes')
plt.ylabel('Size (MB)')
plt.show()

# Histogram for app popularity
df['Installs'].hist(bins=20, edgecolor='black')
plt.title('Distribution of App Popularity')
plt.xlabel('Downloads/User Count')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [55]:
# Sentiment Analysis
In [57]:
df2 = pd.read_csv('user_reviews.csv')
In [61]:
print(df2.head())
                     App                                  Translated_Review  \
0  10 Best Foods for You  I like eat delicious food. That's I'm cooking ...   
1  10 Best Foods for You    This help eating healthy exercise regular basis   
2  10 Best Foods for You                                                NaN   
3  10 Best Foods for You         Works great especially going grocery store   
4  10 Best Foods for You                                       Best idea us   

  Sentiment  Sentiment_Polarity  Sentiment_Subjectivity  
0  Positive                1.00                0.533333  
1  Positive                0.25                0.288462  
2       NaN                 NaN                     NaN  
3  Positive                0.40                0.875000  
4  Positive                1.00                0.300000  
In [65]:
import re
In [75]:
def clean_text(text):
    if isinstance(text, str):
        # Remove non-alphabetic characters
        text = re.sub(r'[^A-Za-z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
    else:
        text = ''
    return text
In [71]:
print(df2.columns)
Index(['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity',
       'Sentiment_Subjectivity'],
      dtype='object')
In [77]:
df2['cleaned_reviews'] = df2['Translated_Review'].apply(clean_text)
In [79]:
print(df2[['Translated_Review', 'cleaned_reviews']].head())
                                   Translated_Review  \
0  I like eat delicious food. That's I'm cooking ...   
1    This help eating healthy exercise regular basis   
2                                                NaN   
3         Works great especially going grocery store   
4                                       Best idea us   

                                     cleaned_reviews  
0  i like eat delicious food thats im cooking foo...  
1    this help eating healthy exercise regular basis  
2                                                     
3         works great especially going grocery store  
4                                       best idea us  
In [81]:
# Interactive visualizations
In [87]:
import plotly.express as px
In [89]:
# Example: App distribution across categories
fig = px.bar(df, x='Category', y='Installs', title='App Distribution Across Categories')
fig.show()
In [95]:
# Example: App ratings vs. size
fig = px.scatter(df, x='Rating', y='Size', title='App Ratings vs. Size', hover_data=['App'])
fig.show()
In [97]:
# Example: Distribution of app ratings
fig = px.histogram(df, x='Rating', title='Distribution of App Ratings')
fig.show()
In [99]:
# Example: Customizing the bar chart
fig = px.bar(df, x='Category', y='Installs', title='App Distribution Across Categories',
             labels={'Category': 'Category', 'Installs': 'Number of Apps'},
             color='Category', barmode='group')
fig.update_layout(xaxis_title='Category', yaxis_title='Number of Apps')
fig.show()
In [ ]: